Imports¶

In [1]:
import numpy as np

from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split

from scipy.stats import ttest_rel

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import plotly.offline as pyo
pyo.init_notebook_mode()

# Local imports
from ipynb.fs.defs.task3_1 import DatasetManager
from ipynb.fs.defs.task3_2 import ModelManager, plot_bar_data

Code¶

Helper Functions¶

In [2]:
# T-Test
def perform_t_test(model_1, model_2, X, y, model_type, cv=10, model_1_name="Model 1", model_2_name="Model 2"):
    """
    Performs a paired t-test on the cross-validation scores of two models.

    Parameters
    ----------
    model_1, model_2 : estimator
        The scikit-learn estimators to compare.
    X, y : array-like
        Feature matrix and target vector used for cross-validation.
    model_type : str
        "clf" to compare by accuracy, "reg" to compare by negative MSE.
    cv : int, default 10
        Number of cross-validation folds.
    model_1_name, model_2_name : str
        Labels used for the two models in the bar-plot legend.

    Returns
    -------
    tuple
        (t-statistic, p-value, bar plot of per-fold scores plus the mean).

    Raises
    ------
    ValueError
        If ``model_type`` is neither "clf" nor "reg".
    """
    # Select the scoring metric for the task type; previously an unknown
    # model_type caused a NameError further down — fail fast instead.
    if model_type == "clf":
        scoring = "accuracy"
    elif model_type == "reg":
        scoring = "neg_mean_squared_error"
    else:
        raise ValueError(f"model_type must be 'clf' or 'reg', got {model_type!r}")

    # Perform n-fold cross validation and obtain relevant scores
    model_1_scores = cross_val_score(model_1, X, y, cv=cv, scoring=scoring)
    model_2_scores = cross_val_score(model_2, X, y, cv=cv, scoring=scoring)

    # Mean score across folds (shown as an extra "Average" bar in the plot)
    model_1_mean = np.mean(model_1_scores)
    model_2_mean = np.mean(model_2_scores)

    # Paired t-test: valid because both models are scored on the same folds
    t_stat, p_val = ttest_rel(model_1_scores, model_2_scores)

    # Build visualisation of cross-validation scores
    x = [f"Fold {i+1}" for i in range(model_1_scores.size)] + ["Average"]
    mean_bar_plot = plot_bar_data(
        (model_1_name, list(model_1_scores) + [model_1_mean]),
        (model_2_name, list(model_2_scores) + [model_2_mean]),
        x=x,
        title="Cross Validation Scores",
        x_label="Folds",
        y_label="Accuracy" if model_type == "clf" else "NMSE",
    )

    return t_stat, p_val, mean_bar_plot

Model Manager Class¶

In [3]:
# ModelManager class is modified to accommodate new classification and regression models
class ModelManager2(ModelManager):
    """ModelManager variant that trains MLP models instead of SVMs."""

    def __init__(self, feature_set, targets):
        super().__init__(feature_set, targets)

    def train_model(self, model_type, cv_folds=10):
        """
        Modified version of the same function (from Task3-2) for
        training either a classification or regression model and
        optimising hyperparameters using cross validation. This
        version works with MLPClassifier and MLPRegressor instead
        of SVR and SVC.

        Parameters
        ----------
        model_type : str
            "clf" for MLPClassifier, "reg" for MLPRegressor.
        cv_folds : int, default 10
            Number of cross-validation folds used by GridSearchCV.

        Raises
        ------
        ValueError
            If ``model_type`` is neither "clf" nor "reg".
        """
        # `is not None` (PEP 8) rather than `!= None`; guards against calling
        # this method before split_dataset().
        assert self._train_and_test_sets is not None, "You don't have your training and test sets."
        # Getting training and test data
        X_train = self._train_and_test_sets.get("X_train")
        y_train = self._train_and_test_sets.get("y_train")

        X_test = self._train_and_test_sets.get("X_test")
        # y_test is unused here but retrieved for symmetry with X_test
        y_test = self._train_and_test_sets.get("y_test")

        # Initialise model; max_iter is raised well above the sklearn default
        # so the MLP optimiser has a chance to converge
        if model_type == "clf":
            estimator = MLPClassifier(max_iter=2000)
            print("Classifier model initialised...")
        elif model_type == "reg":
            estimator = MLPRegressor(max_iter=2000)
            print("Regression model initialised...")
        else:
            # Previously an unknown model_type left `estimator` undefined and
            # caused a NameError below; fail fast with a clear message.
            raise ValueError(f"model_type must be 'clf' or 'reg', got {model_type!r}")

        # Specify hyperparameter ranges to be searched
        parameter_grid = [{
            "learning_rate": ["constant", "invscaling", "adaptive"],
            "alpha": [0.0001, 0.001, 0.01],
        }]

        # Model fitting, cross-validation and hyperparameter optimisation using
        # GridSearch; refit=True retrains the best model on the full training set
        model = GridSearchCV(
            estimator=estimator, 
            param_grid=parameter_grid, 
            cv=cv_folds,
            refit=True
        )

        print("Fitting model and performing cross-validation...")
        model.fit(X_train, y_train)
        print("Model fitting and cross-validation complete...")

        # Getting predictions
        print("Making predictions...")
        self._train_preds = model.predict(X_train)
        self._test_preds = model.predict(X_test)

        # Class probabilities only exist for the classifier
        if model_type == "clf":
            self._train_preds_prob = model.predict_proba(X_train)
            self._test_preds_prob = model.predict_proba(X_test)

        # Assigning class variables
        self._trained_model = model
        self._best_hps = model.best_params_

Loading Datasets¶

In [4]:
# Productivity dataset; using optimal configuration as determined in Task3-1
gwp_dsm = DatasetManager("gwp_assessment")
gwp_dsm.load_and_preprocess([0,1,2,3], "iterative")
gwp_dsm.create_feature_set(7)
gwp_dsm.scale_feature_set()

# Star dataset; using optimal configuration as determined in Task3-1
star_dsm = DatasetManager("star_assessment")
star_dsm.load_and_preprocess([0,1,8,9,12,16,17], "knn")
star_dsm.create_feature_set(8)
star_dsm.scale_feature_set()
Dataset loaded...
Dataset cleaned..
Dataset encodings..
Dataset numerised...
Missing values imputed...
Dataset loaded...
Dataset cleaned..
Dataset encodings..
Dataset numerised...
Missing values imputed...

Getting Targets and Features¶

In [5]:
# Productivity dataset
gwp_features = gwp_dsm.get_scaled_feat_ds()
gwp_targets = gwp_dsm.get_complete_ds()[:, -1]

# Star dataset
star_features = star_dsm.get_scaled_feat_ds()
star_targets = star_dsm.get_complete_ds()[:, -1]

Initialising Model Managers¶

In [6]:
# GWP dataset: manager wrapping the MLP training/evaluation workflow
gwp_mm = ModelManager2(gwp_features, gwp_targets)

# Star dataset
star_mm = ModelManager2(star_features, star_targets)

Model Evaluation¶

Methodology

  1. Datasets will be split into training and test sets.
  2. Models will be trained on training sets; cross validation will be used to optimise hyperparameters.
  3. Model performance will be evaluated using selected evaluation metrics; the results will then be visualised to paint a full picture of a model's performance.
  4. Steps 1-3 will be repeated for several training-test splits (80-20, 75-25, 70-30, 60-40, 50-50) to assess the effect of split ratio on model performance.

Evaluation metrics

  • Productivity dataset: accuracy, precision, recall, F1 score. These metrics are ideal metrics for evaluating classification models as they provide comprehensive insight into a model's performance. Accuracy helps understand the overall effectiveness of the model. However, it can be misleading in imbalanced datasets, which is where precision and recall come in. They provide a more nuanced view of the model's ability to correctly identify positive instances and avoid false positives. The F1 score harmonises precision and recall, offering a single metric that seeks a balance between these two characteristics, making it especially useful when the costs of false positives and false negatives are significantly different.

  • Star dataset: mean squared error (MSE), mean absolute error (MAE), R2 score. These are robust metrics for evaluating regression models, with each illuminating different aspects of model performance. MSE emphasizes larger errors by squaring residuals, making it useful when larger errors are undesirable. MAE provides a more straightforward measure of average error magnitude, regardless of direction. The R2 score complements these by providing a relative measure of how much variance the model can explain, giving a broader picture of model performance beyond just raw error. These combined provide a comprehensive assessment of the model's effectiveness.

Notes

  • Due to the size of the star dataset (as well as the limitations of the machine on which this program was developed) only a small subset of the dataset (approximately 2%) will be used to train models.

80-20 Split¶

Splitting Datasets into Train and Test Sets¶

In [7]:
# Splitting productivity dataset (80-20 train-test split)
gwp_mm.split_dataset(train_size=0.8, test_size=0.2)

# Splitting star dataset; only ~2% of the data is used (see Notes above),
# so 0.016/0.004 preserves the 80-20 ratio within that subset
star_mm.split_dataset(train_size=0.016, test_size=0.004)

Model Training¶

In [8]:
# Productivity dataset: MLP regressor with 10-fold CV hyperparameter search
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [9]:
# Star dataset: MLP classifier with 10-fold CV hyperparameter search
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [10]:
# Productivity dataset: visualise regression metrics on train and test sets
gwp_mm.visualise_results_reg()
In [11]:
# Star dataset: encoding map for column 17 (presumably the target column) is
# passed so plots can show class names rather than encoded values
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.
  • The R2 score is particularly low on the test set, which could be an indication of underfitting.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring only slightly better on the training set across all metrics.
  • The scores indicate that the model has neither over- or under- fitted on the training data and can generalise well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2 whilst the average precision for class 0 is much higher.
  • The precision-recall curves show that the model is unable to simultaneously have good recall and good precision for classes 1 and 2.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 0 being the model's weakest class and class 1 being the strongest.

75-25 Split¶

Splitting Datasets into Train and Test Sets¶

In [12]:
# Splitting productivity dataset (75-25 train-test split)
gwp_mm.split_dataset(train_size=0.75, test_size=0.25)

# Splitting star dataset; 0.015/0.005 keeps a 75-25 ratio within the ~2% subset
star_mm.split_dataset(train_size=0.015, test_size=0.005)

Model Training¶

In [13]:
# Productivity dataset: retrain the MLP regressor on the new 75-25 split
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [14]:
# Star dataset: retrain the MLP classifier on the new 75-25 split
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [15]:
# Productivity dataset: visualise regression metrics for the 75-25 split
gwp_mm.visualise_results_reg()
In [16]:
# Star dataset: classification plots, labelled via the column-17 encoding map
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 scores suggest that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.
  • The R2 score is particularly low on the test set, which could be an indication of underfitting.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring roughly 10 percentage points better on the training set across all metrics.
  • The scores indicate that the model has not underfitted on the training data and may in fact have slightly overfitted; but it can still generalise well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is much better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for classes 1 and 2.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 0 and 2 having very similar curves and class 1 having the clear advantage.

70-30 Split¶

Splitting Datasets into Train and Test Sets¶

In [17]:
# Splitting productivity dataset (70-30 train-test split)
gwp_mm.split_dataset(train_size=0.7, test_size=0.3)

# Splitting star dataset; 0.014/0.006 keeps a 70-30 ratio within the ~2% subset
star_mm.split_dataset(train_size=0.014, test_size=0.006)

Model Training¶

In [18]:
# Productivity dataset: retrain the MLP regressor on the new 70-30 split
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [19]:
# Star dataset: retrain the MLP classifier on the new 70-30 split
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [20]:
# Productivity dataset: visualise regression metrics for the 70-30 split
gwp_mm.visualise_results_reg()
In [21]:
# Star dataset: classification plots, labelled via the column-17 encoding map
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 scores suggest that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.
  • The R2 score is particularly low on the test set, which could be an indication of underfitting.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model showing very similar performance between the training set and test set across all metrics.
  • The scores indicate that the model has neither over- or under- fitted on the training data and can generalise well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is somewhat better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset; although it does perform better for class 0.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 1 being the strongest.

60-40 Split¶

Splitting Datasets into Train and Test Sets¶

In [22]:
# Splitting productivity dataset (60-40 train-test split)
gwp_mm.split_dataset(train_size=0.6, test_size=0.4)

# Splitting star dataset; 0.012/0.008 keeps a 60-40 ratio within the ~2% subset
star_mm.split_dataset(train_size=0.012, test_size=0.008)

Model Training¶

In [23]:
# Productivity dataset: retrain the MLP regressor on the new 60-40 split
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [24]:
# Star dataset: retrain the MLP classifier on the new 60-40 split
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [25]:
# Productivity dataset: visualise regression metrics for the 60-40 split
gwp_mm.visualise_results_reg()
In [26]:
# Star dataset: classification plots, labelled via the column-17 encoding map
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 scores suggest that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model scoring roughly 10 percentage points better on the training set across all metrics.
  • The scores indicate that the model has not underfitted on the training data and may in fact have overfitted slightly; but it can still generalise fairly well to new data.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is somewhat better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 1 being the strongest.

50-50 Split¶

Splitting Datasets into Train and Test Sets¶

In [27]:
# Splitting productivity dataset (50-50 train-test split)
gwp_mm.split_dataset(train_size=0.5, test_size=0.5)

# Splitting star dataset; 0.01/0.01 keeps a 50-50 ratio within the ~2% subset
star_mm.split_dataset(train_size=0.01, test_size=0.01)

Model Training¶

In [28]:
# Productivity dataset: retrain the MLP regressor on the new 50-50 split
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [29]:
# Star dataset: retrain the MLP classifier on the new 50-50 split
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Results¶

In [30]:
# Productivity dataset: visualise regression metrics for the 50-50 split
gwp_mm.visualise_results_reg()
In [31]:
# Star dataset: classification plots, labelled via the column-17 encoding map
encodings = star_dsm.get_encodings()[17]
star_mm.visualise_results_clf(encodings)

Analysis¶

Productivity dataset

  • The model achieves a satisfactorily low score across all error metrics (MSE and MAE) in both the training and test sets.
  • However, the R2 score suggests that the model is unlikely to generalise well to novel data and may therefore require more fine-tuning.

Star dataset

  • The model's accuracy, precision and recall scores are consistent across both the training and test sets, with the model having near-identical performance between the training set and test set across all metrics.
  • The similarity of the scores for the training set and test set could be indicative of underfitting.
  • The precision-recall curves show that the model has a fairly low average precision for classes 1 and 2; the average precision for class 0 is somewhat better.
  • The precision-recall curves also show that the model is unable to simultaneously have good recall and good precision for any of the classes in the dataset.
  • The ROC curves show that the model has a good true-positive rate for all 3 classes, with class 2 being the model's weakest class and class 1 being the strongest.

Analysis of split ratios¶

Productivity dataset

  • The overall accuracy of the model is relatively unaffected by the changing of split ratio. The MSE and MAE scores are fairly consistent across all split ratios with only a slight uptick as split ratio goes from 80-20 to 50-50.
  • The R2 scores are very low for all split ratios but they vary arbitrarily. A robust relationship/correlation cannot be determined; this may require further investigation.
  • 80-20 appears to be the optimal split ratio.

Star dataset

  • Whilst model performance (across all metrics) is adequate for all split ratios, the gap in performance between the training set and test set begins to narrow slightly as the split ratio approaches 50-50, with overall performance decreasing as well; this suggests that as the model is fed less and less training data, it starts to underfit.
  • The true-positive rate for all classes remains fairly consistent as the split ratio approaches 50-50, with the 80-20 split having the slight edge.
  • Conversely, the average precision for classes 1 and 2 picks up slightly as the ratio approaches 50-50; class 0 remains fairly consistent. This would suggest that split ratios closer to 50-50 have a better distribution of all the classes in the dataset.
  • The optimal split ratio appears to be 80-20.

Hypothesis Testing¶

Task3-2 (SVM) Models¶

In [32]:
# Initialising, training and optimising model from Task3-2 using optimal train-test split ratio
# GWP dataset (baseline SVM regressor for the hypothesis test)
# NOTE(review): 70-30 is used here while the MLP analysis above found 80-20 optimal —
# presumably 70-30 was the optimal ratio found in Task3-2; confirm
gwp_mm_old = ModelManager(gwp_features, gwp_targets)
gwp_mm_old.split_dataset(train_size=0.7, test_size=0.3)
gwp_mm_old.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [33]:
# Initialising, training and optimising model from Task3-2 using optimal train-test split ratio
# Star dataset (baseline SVM classifier for the hypothesis test)
star_mm_old = ModelManager(star_features, star_targets)
star_mm_old.split_dataset(train_size=0.015, test_size=0.005)
star_mm_old.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Task3-3 (MLP) Models¶

In [34]:
# Initialising, training and optimising model from Task3-3 using optimal training-test split
# GWP dataset (MLP regressor, 80-20 split as found optimal above)
gwp_mm = ModelManager2(gwp_features, gwp_targets)
gwp_mm.split_dataset(train_size=0.8, test_size=0.2)
gwp_mm.train_model("reg", 10)
Regression model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...
In [35]:
# Initialising, training and optimising model from Task3-3 using optimal train-test split
# (comment previously said Task3-2, but this is the MLP ModelManager2)
# Star dataset (MLP classifier, 80-20 split within the ~2% subset)
star_mm = ModelManager2(star_features, star_targets)
star_mm.split_dataset(train_size=0.016, test_size=0.004)
star_mm.train_model("clf", 10)
Classifier model initialised...
Fitting model and performing cross-validation...
Model fitting and cross-validation complete...
Making predictions...

Getting Features and Targets¶

In [36]:
# Productivity dataset: shuffle and take effectively the whole dataset
# (train_size=0.9999) for the cross-validated t-test below
# NOTE(review): no random_state is set, so the sampled subsets — and hence the
# t-test results — vary between runs; consider fixing a seed
gwp_X, _, gwp_y, _ = train_test_split(
    gwp_features, 
    gwp_targets,
    train_size=0.9999,
    test_size=None,
)

# Star dataset: sample ~2% of the data, matching the subset size used for training
star_X, _, star_y, _ = train_test_split(
    star_features, 
    star_targets,
    train_size=0.02,
    test_size=None,
)

T-Tests¶

Productivity Dataset¶

In [37]:
# Get the fitted models (SVM baseline vs MLP) to compare on the GWP dataset
svm_reg = gwp_mm_old.get_trained_model()
mlp_reg = gwp_mm.get_trained_model()
In [38]:
# Perform paired t-test on 5-fold CV scores (NMSE, since model_type="reg")
t_stat, p_val, cv_bar_plot = perform_t_test(
    svm_reg, 
    mlp_reg, 
    gwp_X, 
    gwp_y, 
    model_type="reg", 
    cv=5, 
    model_1_name="SVM Regressor", 
    model_2_name="MLP Regressor"
)
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")
t-statistic: 2.090858362757806
p-value: 0.10472907300670688
In [39]:
# Visualise cross-validation scores (bare last expression renders the figure)
cv_bar_plot

Star Dataset¶

In [40]:
# Get the fitted models (SVM baseline vs MLP) to compare on the star dataset
svm_clf = star_mm_old.get_trained_model()
mlp_clf = star_mm.get_trained_model()
In [41]:
# Perform paired t-test on 5-fold CV scores (accuracy, since model_type="clf")
t_stat, p_val, cv_bar_plot = perform_t_test(
    svm_clf, 
    mlp_clf, 
    star_X, 
    star_y, 
    model_type="clf", 
    cv=5, 
    model_1_name="SVM Classifier", 
    model_2_name="MLP Classifier"
)
print(f"t-statistic: {t_stat}")
print(f"p-value: {p_val}")
t-statistic: -0.4009104504102879
p-value: 0.7089783348997766
In [42]:
# Visualise cross-validation scores (bare last expression renders the figure)
cv_bar_plot

Markdown Answer¶

Productivity dataset

  • The hypothesis test for this dataset used a significance level of 0.05. The null hypothesis was: there is no significant difference in mean-squared error between the SVM and MLP regressors. The alternative hypothesis was: there is a significant difference in mean-squared error between the SVM and MLP regressors. The hypothesis test yielded a p-value of 0.1047; this value is more than double the significance level and as such the null hypothesis could not be rejected. This conclusion is reaffirmed in the cross-validation data where we can see that there is indeed no significant statistical difference between the models.

Star Dataset

  • The hypothesis test for this dataset used a significance level of 0.05. The null hypothesis was: there is no significant difference in accuracy between the SVM and MLP classifiers. The alternative hypothesis was: there is a significant difference in accuracy between the SVM and MLP classifiers. The hypothesis test yielded a p-value of 0.709; this value is substantially higher than the significance level and as such the null hypothesis could not be rejected. This conclusion is reaffirmed in the cross-validation data where we can see that there is indeed no significant statistical difference between the models.
In [ ]: